# setwd("C:/Users/MG/Desktop/Vedecom")
library(readr)
# Load the vehicle trip summary; the unnamed row-index column X1 is skipped.
vehicle <- read_csv("vehicle.csv", col_types = cols(X1 = col_skip()))
Number of observations 1711.
Number of columns 9.
Définitions des colonnes selon Flavien et Youenn :
library(DT)
# Interactive, filterable table of the raw data for visual inspection.
# (TRUE instead of the reassignable shorthand T.)
datatable(vehicle, rownames = FALSE, filter = "top", options = list(pageLength = 10, scrollX = TRUE))
Remarque : Est-ce que l’intervalle de temps entre deux trips a été pris en compte ? (Figure ci-dessous du FCDM.pptx)
Renaming variables V2 V3 V4:
I called them “morning”, “day”, “night”… But I am waiting for confirmation about the meaning of these variables (not the same as in FCDM.pptx); also: à quelles heures cela correspond-il ?
library(dplyr)
# Rename V2/V3/V4 to provisional period labels and drop the identifier,
# which carries no information for the clustering study.
# NOTE(review): the morning/night/day mapping is awaiting confirmation —
# it does not match FCDM.pptx.
vehicle <- vehicle %>%
  rename(morning = V2, night = V3, day = V4) %>%
  select(-vehicleId)
library(tidyverse)
library(tidylog)
# Derive absolute durations. AvgDuration / AvgWaiting appear to be stored
# as fractions of a day (hence the * 24) — TODO confirm with data owners.
vehicle <- vehicle %>%
  mutate(TotalDurationHour    = AvgDuration * NbrTrips * 24,      # total driving time per vehicle
         TotalDurationMinutes = AvgDuration * NbrTrips * 24 * 60,
         AvgDurationHour      = AvgDuration * 24,                 # mean trip length
         AvgDurationMinutes   = AvgDuration * 24 * 60,
         AvgWaitingHour       = AvgWaiting * 24,                  # mean idle time between trips
         AvgWaitingMinutes    = AvgWaiting * 24 * 60)
Summary
library(skimr)
# Quick summary of every column: missingness, quantiles, inline histograms.
vehicle %>%
skim()
| Name | Piped data |
| Number of rows | 1711 |
| Number of columns | 14 |
| _______________________ | |
| Column type frequency: | |
| numeric | 14 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| morning | 0 | 1 | 2.24 | 2.82 | 0.00 | 0.00 | 2.00 | 3.00 | 21.00 | ▇▁▁▁▁ |
| night | 0 | 1 | 0.88 | 1.35 | 0.00 | 0.00 | 0.00 | 1.00 | 15.00 | ▇▁▁▁▁ |
| day | 0 | 1 | 1.31 | 2.08 | 0.00 | 0.00 | 0.00 | 2.00 | 20.00 | ▇▁▁▁▁ |
| NbrTrips | 0 | 1 | 4.43 | 2.91 | 2.00 | 2.00 | 4.00 | 5.00 | 25.00 | ▇▂▁▁▁ |
| AvgDuration | 0 | 1 | 0.01 | 0.01 | 0.00 | 0.00 | 0.01 | 0.01 | 0.07 | ▇▁▁▁▁ |
| AvgWaiting | 0 | 1 | 0.01 | 0.02 | -0.18 | 0.00 | 0.00 | 0.01 | 0.28 | ▁▇▆▁▁ |
| distDA | 0 | 1 | 30.61 | 30.41 | 0.00 | 8.13 | 20.65 | 43.07 | 203.75 | ▇▂▁▁▁ |
| meanRatio | 0 | 1 | 1.59 | 2.98 | 0.06 | 1.19 | 1.29 | 1.41 | 90.76 | ▇▁▁▁▁ |
| TotalDurationHour | 0 | 1 | 0.74 | 0.64 | 0.00 | 0.32 | 0.55 | 0.95 | 4.66 | ▇▂▁▁▁ |
| TotalDurationMinutes | 0 | 1 | 44.52 | 38.49 | 0.10 | 19.16 | 33.15 | 56.93 | 279.32 | ▇▂▁▁▁ |
| AvgDurationHour | 0 | 1 | 0.18 | 0.15 | 0.00 | 0.09 | 0.14 | 0.23 | 1.72 | ▇▁▁▁▁ |
| AvgDurationMinutes | 0 | 1 | 10.90 | 8.72 | 0.05 | 5.38 | 8.48 | 13.69 | 103.07 | ▇▁▁▁▁ |
| AvgWaitingHour | 0 | 1 | 0.22 | 0.45 | -4.38 | 0.03 | 0.08 | 0.24 | 6.81 | ▁▇▆▁▁ |
| AvgWaitingMinutes | 0 | 1 | 12.94 | 27.02 | -262.87 | 1.88 | 4.72 | 14.43 | 408.47 | ▁▇▆▁▁ |
The columns I want to include in the study: “morning”,“day”,“night”,“NbrTrips”,“AvgDurationHour”,“AvgWaitingHour”,“distDA”,“meanRatio”,“TotalDurationHour”
# Confirm the column set after renaming and adding the derived variables.
colnames(vehicle)
## [1] "morning" "night" "day"
## [4] "NbrTrips" "AvgDuration" "AvgWaiting"
## [7] "distDA" "meanRatio" "TotalDurationHour"
## [10] "TotalDurationMinutes" "AvgDurationHour" "AvgDurationMinutes"
## [13] "AvgWaitingHour" "AvgWaitingMinutes"
# Correlation matrix restricted to the variables retained for the study.
features <- c("morning", "night", "day", "NbrTrips", "AvgDurationHour",
              "AvgWaitingHour", "distDA", "meanRatio", "TotalDurationHour")
M <- cor(vehicle[, features])
corrplot::corrplot.mixed(M, upper = "square", tl.col = "black")
About meanRatio
meanRatio n’est corrélée avec aucune variable. Bizarre ? Regardons les nuages de points : ils montrent des points extrêmes qui causent cette corrélation nulle. À faire : identifier ces points, vérifier s’il s’agit des mêmes véhicules, puis les traiter.
# library(GGally)
# ggscatmat(vehicle)
# Scatterplot matrix of the selected features (makes the extreme
# meanRatio points visible).
GGally::ggpairs(vehicle[,features])
Classer voiture selon le max de trips par durée. Ok?
Calculating new variables to see colors, named category: morning or day or night with respect to max between them
# Label each vehicle by the period with the most trips; columns 1:3 are
# morning, night, day after the earlier rename.
vehicle$category <- as.factor(colnames(vehicle[, 1:3])[apply(vehicle[, 1:3], 1, which.max)])
features <- c(features, "category")
# 3D view of the three period counts, colored by the dominant period.
plotly::plot_ly(vehicle, x = ~morning, y = ~day, z = ~night, color = ~category)
# Long format for faceted boxplots of every feature, split by category.
vehicle_melted <- reshape2::melt(vehicle[, features], id.var = "category")
p <- ggplot(data = vehicle_melted, aes(x = variable, y = value)) +
  geom_boxplot(aes(fill = category))
p + facet_wrap(~ variable, scales = "free")
# library(GGally)
# ggscatmat(vehicle)
# Same scatterplot matrix, now colored by the dominant trip-period category.
GGally::ggpairs(vehicle[,features], mapping=ggplot2::aes(colour = category))
library("FactoMineR")
library("factoextra")
# PCA on all variables except column 15 (the `category` factor, see the
# colnames() output above); FactoMineR::PCA standardizes by default.
res.pca <- PCA(vehicle[, -15], graph = FALSE)
get_eigenvalue(res.pca)
## eigenvalue variance.percent cumulative.variance.percent
## Dim.1 4.258660e+00 3.041900e+01 30.41900
## Dim.2 3.152948e+00 2.252106e+01 52.94006
## Dim.3 2.260576e+00 1.614697e+01 69.08703
## Dim.4 1.507545e+00 1.076818e+01 79.85521
## Dim.5 1.331031e+00 9.507363e+00 89.36257
## Dim.6 9.947973e-01 7.105695e+00 96.46826
## Dim.7 3.170129e-01 2.264378e+00 98.73264
## Dim.8 1.774302e-01 1.267359e+00 100.00000
## Dim.9 1.391635e-28 9.940250e-28 100.00000
## Dim.10 4.818850e-30 3.442036e-29 100.00000
## Dim.11 5.300806e-31 3.786290e-30 100.00000
## Dim.12 9.193901e-32 6.567072e-31 100.00000
## Dim.13 4.224568e-32 3.017549e-31 100.00000
## Dim.14 1.559004e-32 1.113574e-31 100.00000
# Scree plot: variance explained by each principal component.
fviz_eig(res.pca, addlabels = TRUE, ylim = c(0, 50))
# Variable correlation circle, colored by quality of representation (cos2).
fviz_pca_var(res.pca, col.var = "cos2",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE # Avoid overlapping text labels
)
# Biplot: individuals colored by trip-period category, variables in black.
fviz_pca_biplot(res.pca,
col.var = "black", # Color of the variables
col.ind = vehicle$category # Color of the individuals
)
var <- get_pca_var(res.pca)
# Heatmap of variable contributions per dimension (not a correlation matrix).
corrplot::corrplot(var$contrib, is.corr=FALSE)
# Contributions of the variables to PC1
contrib1 = fviz_contrib(res.pca, choice = "var", axes = 1, top = 10)
# Contributions of the variables to PC2
contrib2 = fviz_contrib(res.pca, choice = "var", axes = 2, top = 10)
gridExtra::grid.arrange(contrib1, contrib2, ncol=2)
# Same biplot without the category coloring.
fviz_pca_biplot(res.pca,
col.var = "red", # Color of the variables
col.ind = "#696969" # Color of the individuals
)
(t-SNE) t-Distributed Stochastic Neighbor Embedding is a non-linear dimensionality reduction algorithm used for exploring high-dimensional data.
library(Rtsne)
# t-SNE embedding of the full data frame. The log below reports a
# 1711 x 17 matrix — presumably Rtsne expanded the data frame (14 numeric
# columns + the 3-level category factor) via model.matrix; TODO confirm.
# NOTE(review): consider excluding `category` and the redundant
# minute/hour duplicates before embedding.
tsne <- Rtsne(vehicle, dims = 2, perplexity=30, verbose=TRUE, max_iter = 500)
## Performing PCA
## Read the 1711 x 17 data matrix successfully!
## OpenMP is working. 1 threads.
## Using no_dims = 2, perplexity = 30.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 0.39 seconds (sparsity = 0.067740)!
## Learning embedding...
## Iteration 50: error is 72.309172 (50 iterations in 0.35 seconds)
## Iteration 100: error is 63.821468 (50 iterations in 0.24 seconds)
## Iteration 150: error is 63.619598 (50 iterations in 0.24 seconds)
## Iteration 200: error is 63.610323 (50 iterations in 0.25 seconds)
## Iteration 250: error is 63.609266 (50 iterations in 0.24 seconds)
## Iteration 300: error is 1.160615 (50 iterations in 0.22 seconds)
## Iteration 350: error is 0.927751 (50 iterations in 0.21 seconds)
## Iteration 400: error is 0.849097 (50 iterations in 0.22 seconds)
## Iteration 450: error is 0.821106 (50 iterations in 0.23 seconds)
## Iteration 500: error is 0.807949 (50 iterations in 0.23 seconds)
## Fitting performed in 2.42 seconds.
# Compare the t-SNE and PCA embeddings: top row unlabeled, bottom row
# colored by dominant-period category (the factor passed to `bg` is
# coerced to palette indices).
par(mfrow=c(2,2))
plot(tsne$Y, main="tsne", pch=21, bg="black")
plot(res.pca$ind$coord[,1:2], main = "PCA", pch=21, bg="black")
plot(tsne$Y, main="tsne", pch=21, bg= vehicle$category)
plot(res.pca$ind$coord[,1:2], main = "PCA", pch=21, bg= vehicle$category) # color by category (highest number of trips)
# library(tourr)
# X11() # Windows
# animate_xy(vehicle, axes = "off")
New PCA, without the variables “morning”, “day”, “night”
# PCA without columns 1:3 (morning/night/day), 9 (TotalDurationHour, per the
# colnames() output above) and 15 (the category factor), to see how much the
# period counts drive the axes.
new.pca <- PCA(vehicle[, -c(1:3, 9, 15)], graph = FALSE)
gridExtra::grid.arrange(
  fviz_pca_biplot(res.pca,
                  col.var = "black",           # color of the variables
                  col.ind = vehicle$category,  # color of the individuals
                  title = "with morning day night"),
  fviz_pca_biplot(new.pca,
                  col.var = "black",           # color of the variables
                  col.ind = vehicle$category,  # color of the individuals
                  title = "without morning day night"),
  ncol = 2)
Apparemment la présence de ces variables aide un peu, pas trop, ici.
# manual scale figure
#
# library(tidyverse)
# library(tidylog)
#
# plot_data = as.data.frame(ac$scores[,1:2]) %>%
# bind_cols(as_tibble(res.fcm$u)) %>%
# pivot_longer(cols=starts_with('Cluster'), names_to='Cluster', values_to='Prob')
#
#
#
# plot_data = vehicle %>%
# pivot_longer(cols=c(morning,day,night))
#
# ggplot(plot_data, aes(NbrTrips, distDA, color=name, alpha=value)) +
# geom_point(size=2, shape=16) +
# scale_color_manual(values=c(`morning`='red', `day`='green', `night`='blue')) +
# guides(alpha='none') +
# theme_minimal()
#
# p <- ggplot(vehicle, aes(NbrTrips, AvgDuration)) +
# geom_point(aes(colour = factor(category)))
# Drop the trailing "category" entry so `features` is numeric-only again.
features <- features[-length(features)]
set.seed(1812)
# NOTE(review): the features live on very different scales (e.g. distDA up
# to ~200 vs AvgWaitingHour ~0.2), so distDA dominates the Euclidean
# distance; consider kmeans(scale(vehicle[, features]), ...). Kept unscaled
# here to preserve the clusters reported below.
res.km <- kmeans(vehicle[, features], 3, nstart = 20)
par(mfrow = c(1, 2))
plot(tsne$Y, main = "tsne", pch = 21, bg = res.km$cluster)
plot(res.pca$ind$coord[, 1:2], main = "PCA", pch = 21, bg = res.km$cluster) # colored by k-means cluster
les clusters sont séparés dans la visualisation du tsne!
Regardons la répartition dans les clusters
# Attach the k-means assignment and profile each feature per cluster.
vehicle_tomelt <- add_column(vehicle, cluster = res.km$cluster)
vehicle_tomelt$cluster <- as.factor(vehicle_tomelt$cluster)
features <- c(features, "cluster")
vehicle_tomelt <- vehicle_tomelt[, features]
vehicle_melted <- reshape2::melt(vehicle_tomelt, id.var = "cluster")
vehicle_melted$cluster <- as.factor(vehicle_melted$cluster)
p <- ggplot(data = vehicle_melted, aes(x = variable, y = value)) +
  geom_boxplot(aes(fill = cluster))
p + facet_wrap(~ variable, scales = "free")
p <- GGally::ggpairs(vehicle_tomelt, mapping = ggplot2::aes(colour = cluster))
# Keep only the last row of the pair plot (each feature vs. cluster).
# p$nrow/p$ncol replace the previously hard-coded 10 (same value today),
# so this survives changes to the feature list.
plots <- lapply(seq_len(p$ncol), function(j) GGally::getPlot(p, i = p$nrow, j = j))
GGally::ggmatrix(plots, nrow = 1, ncol = p$ncol, xAxisLabels = p$xAxisLabels[1:p$ncol])
# Reset the feature list (without "cluster") for model-based clustering.
features = c("morning","night","day","NbrTrips","AvgDurationHour","AvgWaitingHour","distDA","meanRatio","TotalDurationHour")
library(mclust)
# Gaussian mixture model; Mclust selects the number of components and the
# covariance model by BIC.
gmm <- Mclust(vehicle[,features])
plot(gmm, what = "BIC")
Si on laisse Mclust choisir automatiquement il choisit 1 cluster !
Essayons avec 3 ou 4 clusters (gaussiennes)
# Force 3 mixture components and inspect the cluster sizes.
gmm3 <- Mclust(vehicle[,features], G = 3)
table(gmm3$classification)
##
## 1 2 3
## 795 47 869
# Same with 4 components...
gmm4 <- Mclust(vehicle[,features], G = 4)
table(gmm4$classification)
##
## 1 2 3 4
## 799 334 48 530
# ...and 5: a small cluster (~47-57 vehicles) persists in every fit.
gmm5 <- Mclust(vehicle[,features], G = 5)
table(gmm5$classification)
##
## 1 2 3 4 5
## 544 125 57 233 752
il y a toujours un cluster de minorité, environ 47 vehicules
Choisissons donc 3 clusters
# Visualize the GMM (G = 3) assignment on the t-SNE and PCA embeddings.
par(mfrow=c(1,2))
plot(tsne$Y, main="tsne", pch=21, bg= gmm3$classification)
plot(res.pca$ind$coord[,1:2], main = "PCA", pch=21, bg= gmm3$classification) # colored by GMM cluster
Si on regarde la figure à droite (acp) on voit que le cluster minoré est quand même intéressant, il s’agit des voitures extremes
Regardons le biplot
# PCA biplot with individuals colored by the 3-component GMM assignment.
p <- fviz_pca_biplot(res.pca,
                     col.var = "black",                        # color of the variables
                     col.ind = as.factor(gmm3$classification), # color of the individuals
                     title = "clustering by gmm")
p + scale_color_brewer(palette = "Dark2")
Mmmmm ?
# Same per-cluster profiling as for k-means, now with the GMM assignment.
vehicle_tomelt <- add_column(vehicle, cluster = gmm3$classification)
vehicle_tomelt$cluster <- as.factor(vehicle_tomelt$cluster)
features <- c(features, "cluster")
vehicle_tomelt <- vehicle_tomelt[, features]
vehicle_melted <- reshape2::melt(vehicle_tomelt, id.var = "cluster")
vehicle_melted$cluster <- as.factor(vehicle_melted$cluster)
p <- ggplot(data = vehicle_melted, aes(x = variable, y = value)) +
  geom_boxplot(aes(fill = cluster))
p + facet_wrap(~ variable, scales = "free")
p <- GGally::ggpairs(vehicle_tomelt, mapping = ggplot2::aes(colour = cluster))
# Last row of the pair plot only (each feature vs. cluster); p$nrow/p$ncol
# replace the previously hard-coded 10 (same value today).
plots <- lapply(seq_len(p$ncol), function(j) GGally::getPlot(p, i = p$nrow, j = j))
GGally::ggmatrix(plots, nrow = 1, ncol = p$ncol, xAxisLabels = p$xAxisLabels[1:p$ncol])
D’autres méthodes de clustering.. possible de les appliquer si on souhaite
gtm som
Mesure de performance de clustering, en terme de séparation de clusters etc… : validity measures